In [98]:
import warnings
warnings.filterwarnings('ignore')


import os
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import  precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import cross_val_score
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
import gc
In [99]:
#--------------------------------------------------------------------------------------
# Visualization of measurement of results
def plot_metrics_from_wide_df(gram_name,results_df):
    """
    Draw one grouped bar chart per metric column of a wide-format results frame.

    Args:
        gram_name (str): Label of the feature set; used as the prefix of every plot title.
        results_df (DataFrame): Wide-format DataFrame with columns:
                                - 'model': Model name.
                                - 'sampling': Sampling method.
                                - Metric columns (e.g., 'acc', 'prec', 'recall').
                                Each (sampling, model) pair must occur at most once,
                                otherwise ``DataFrame.pivot`` raises ``ValueError``.

    Returns:
        None. The figures are shown with ``plt.show()`` and then closed.
    """
    # Every column other than 'model' and 'sampling' is treated as a metric.
    metrics = [col for col in results_df.columns if col not in ('model', 'sampling')]

    for metric in metrics:
        pretty_name = metric.replace("_", " ").capitalize()
        fig, ax = plt.subplots(figsize=(12, 8))

        # Pivot so that rows are sampling methods, columns are models and
        # cells hold the value of the current metric.
        pivoted_data = results_df.pivot(index='sampling', columns='model', values=metric)

        # Grouped bar chart drawn on the explicit Axes created above.
        pivoted_data.plot(kind='bar', ax=ax, alpha=0.8, edgecolor='black')
        ax.set_title(f'{gram_name} {pretty_name} by Model and Sampling')
        ax.set_ylabel(pretty_name)
        ax.set_xlabel('Sampling Method')
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        ax.legend(title='Model')
        fig.tight_layout()
        plt.show()
        # One figure is created per metric and this function is called once per
        # feature set; close each figure explicitly so they do not accumulate
        # on backends where show() leaves them open.
        plt.close(fig)
In [100]:
def plot_class_distribution(y,title):
    """Print the per-class sample counts of ``y`` and show them as a bar chart.

    Args:
        y: Label container exposing ``value_counts()`` (a pandas Series in this
           notebook). The numeric codes 0, 1 and 2 are translated to their
           class names for the x axis.
        title (str): Text placed in front of " dataset" in the chart title.
    """
    counts_per_class = y.value_counts()
    # The raw counts are printed while the index still holds the numeric codes.
    print(counts_per_class)

    # Numeric class code -> class name used as the bar label.
    code_to_name = {0:'hiçbiri', 1:'nefret', 2:'saldırgan'}
    counts_per_class.index = counts_per_class.index.map(code_to_name)
    bar_labels = counts_per_class.index.tolist()

    # Bar chart of the counts, one bar per class.
    plt.bar(bar_labels, counts_per_class, edgecolor='black', color='skyblue')
    plt.title(f"{title} dataset")
    plt.xlabel("unique")
    plt.ylabel("number")
    plt.xticks(rotation=90)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()
In [101]:
def evaluate_model_with_metrics(gram_name,model_name,sampling,model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data, score it on the test data and report the metrics.

    Args:
        gram_name (str): Feature-set label, used only in the printed header.
        model_name (str): Model label, used only in the printed header.
        sampling (str): Sampling-method label, used only in the printed header.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        tuple: ``(accuracy, precision, recall, f1, training_accuracy, validation_accuracy)``.
            ``precision``, ``recall`` and ``f1`` are weighted averages over the classes.
            ``training_accuracy`` is the mean 3-fold cross-validated accuracy on
            ``(X_train, y_train)``, not the accuracy of the fitted model on its own
            training data. ``validation_accuracy`` is the same value as ``accuracy``
            and is kept as a separate element for backward compatibility.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Mean 3-fold cross-validated accuracy on the training data.
    # NOTE(review): the caller passes resampled data; with over-sampling, copies
    # of one sample can presumably land in different folds, which would make
    # this estimate optimistic. Confirm before interpreting it.
    train_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    training_accuracy = np.mean(train_scores)

    # Flatten the predictions to 1-D; np.ravel also accepts predictions that are
    # not already ndarrays.
    y_pred = np.ravel(model.predict(X_test))

    # Accuracy on the held-out set is computed once and reported under both names.
    accuracy = accuracy_score(y_test, y_pred)
    validation_accuracy = accuracy

    # zero_division=0 scores a class that is never predicted as 0 explicitly,
    # the same value as the default but without the undefined-metric warning.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"\n{model_name},{sampling},{gram_name}:")
    print("accuracy:",accuracy)
    print("precision:",precision)
    print("recall:",recall)
    print("f1:",f1)
    print("Training Accuracy:", training_accuracy)
    print("Validation Accuracy:", validation_accuracy)

    return accuracy,precision, recall, f1, training_accuracy, validation_accuracy
In [102]:
#-------------------------------------------------------------------------
def feature_unionAndmodel_training(gram_name,feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels):
    """Vectorize the text, rebalance the training set three ways and benchmark every model.

    Steps:
      1. Fit ``feature_union`` on ``X_train`` and transform both splits.
      2. Build under-sampled, over-sampled and SMOTE versions of the training
         features, printing and plotting the class distribution of each. The
         test features are not resampled.
      3. Call ``evaluate_model_with_metrics`` for every (sampling, model) pair.
      4. Print and plot the collected metrics and write them to ``'{gram_name}.csv'``.

    Args:
        gram_name (str): Feature-set label used in titles, logs and the CSV file name.
        feature_union: Transformer exposing ``fit_transform`` and ``transform``.
        X_train, X_test: Raw training and test inputs handed to ``feature_union``.
        y_train, y_test: Training and test labels.
        modelsAndNames: Iterable of ``(model_name, model)`` pairs.
        len_labels: Not used by this function; kept in the signature for existing callers.

    Returns:
        DataFrame: One row per (sampling, model) pair with the columns 'model',
        'sampling', 'accuracy', 'precision', 'recall', 'f1_score',
        'training_accuracy' and 'validation_accuracy'.
    """
    train_matrix = feature_union.fit_transform(X_train)
    test_matrix = feature_union.transform(X_test)

    # --- class balance before any resampling
    plot_class_distribution(y_train, f'{gram_name} Original Training Data Class Distribution')
    print("Original Training Data Class Distributionı:", Counter(y_train))

    # --- random under-sampling
    X_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(train_matrix, y_train)
    plot_class_distribution(y_train_under,f'{gram_name} After Under-Sampling,Training Data Class Distribution')
    print("After Under-Sampling,Training Data Class Distribution:", Counter(y_train_under))

    # --- random over-sampling
    X_train_over, y_train_over = RandomOverSampler(random_state=42).fit_resample(train_matrix, y_train)
    print("After Over-Sampling, Training Data Class Distribution:", Counter(y_train_over))
    plot_class_distribution(y_train_over,f'{gram_name} After Over-Sampling, Training Data Class Distribution')

    # --- SMOTE
    smote_sampler = SMOTE(sampling_strategy='auto',random_state=42, k_neighbors=2)
    X_train_smote, y_train_smote = smote_sampler.fit_resample(train_matrix, y_train)
    print("After SMOTE ,Training Data Class Distributionı:", Counter(y_train_smote))
    plot_class_distribution(y_train_smote, f'{gram_name} After SMOTE ,Training Data Class Distributionı')

    # Column order of the results table; the six metric columns follow the
    # order of the tuple returned by evaluate_model_with_metrics.
    result_columns = [
        'model', 'sampling',
        'accuracy', 'precision', 'recall', 'f1_score',
        'training_accuracy', 'validation_accuracy',
    ]
    resampled_sets = (
        ('Under-Sampling', X_train_under, y_train_under),
        ('Over-Sampling', X_train_over, y_train_over),
        ('SMOTE', X_train_smote, y_train_smote),
    )

    # Evaluate every model on every resampled training set, one row per run.
    rows = []
    for sampling, X_train_res, y_train_res in resampled_sets:
        for model_name, model in modelsAndNames:
            print(f"\n{gram_name}:")
            scores = evaluate_model_with_metrics(
                gram_name, model_name, sampling,
                model,
                X_train_res, y_train_res,
                test_matrix, y_test,
            )
            rows.append((model_name, sampling) + tuple(scores))

    # Collect, display, plot and persist the results.
    results_df = pd.DataFrame(rows, columns=result_columns)
    print(results_df)
    plot_metrics_from_wide_df(gram_name,results_df)
    results_df.to_csv(f'{gram_name}.csv', index=False)
    return results_df
In [103]:
import warnings
warnings.filterwarnings('ignore')
import sklearn
print(sklearn.__version__)
import numpy as np
print(np.__version__)


#import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from collections import Counter
import xgboost as xgb
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
import csv
from catboost import CatBoostClassifier




# Load the cleaned and preprocessed tweets from the Excel workbook.
try:
    data = pd.read_excel("temizlenmis_ve_islenmis_veriler.xlsx", engine='openpyxl')
except Exception as e:
    print(f"Dönüştürme sırasında bir hata oluştu: {e}")
    # Re-raise after reporting: continuing would either hit a NameError on the
    # next line or silently reuse a stale `data` left over in the kernel.
    raise

# Every missing cell is replaced by the "hiçbiri" (none) label text.
# NOTE(review): this also turns a missing tweet into the literal text
# "hiçbiri"; confirm that is intended rather than dropping such rows.
data = data.fillna("hiçbiri")

# Keep only the text column and the label column.
data = data[["tweet","etiket"]]
print(data['etiket'].value_counts())

tweets = data['tweet']
labels = data["etiket"]

#--------------------------------------------------------------------------------------
print("Orijinal veri sınıf dağılımı:", Counter(labels))
1.4.2
1.26.4
etiket
hiçbiri      7722
nefret       2336
saldırgan     166
Name: count, dtype: int64
Orijinal veri sınıf dağılımı: Counter({'hiçbiri': 7722, 'nefret': 2336, 'saldırgan': 166})
In [104]:
# Models--------------------------------------------------------------------------------------
# Gradient-boosted trees: XGBoost and CatBoost.
xgb_model = xgb.XGBClassifier(
    n_estimators=50,
    max_depth=6,
    tree_method='gpu_hist',
    gpu_id=0,
    use_label_encoder=False,
    random_state=42,
)
catB_model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.1,
    depth=4,
    verbose=0,
)

# MLPClassifier: artificial neural network classifier trained with SGD.
ann_mlpc_sgd = MLPClassifier(
    hidden_layer_sizes=(20, 10, 5),
    activation='relu',
    solver='sgd',
    alpha=1e-5,
    learning_rate='adaptive',
    max_iter=50,
    random_state=7,
)

# ExtraTreesClassifier: extremely randomized decision trees.
extraTC = ExtraTreesClassifier(n_estimators=100, random_state=7)

# (display name, estimator) pairs, in the order in which they are evaluated.
modelsAndNames = [
    ('XGBoost', xgb_model),
    ('CatBoostC', catB_model),
    ('MLPC-sgd', ann_mlpc_sgd),
    ('ExtraTreesClassifier', extraTC),
]
#--------------------------------------------------------------------------------------
In [105]:
# Encode the categorical labels as integers.
label_mapping = {'nefret': 1, 'hiçbiri': 0, 'saldırgan': 2}
labels = data['etiket'].map(label_mapping)

# Series.map turns every label that is missing from label_mapping into NaN.
# Fail fast instead of passing NaN targets on to the samplers and models.
unmapped_labels = sorted(set(data.loc[labels.isna(), 'etiket'].astype(str)))
if unmapped_labels:
    raise ValueError(f"Labels without an entry in label_mapping: {unmapped_labels}")

# Results of each feature-set run, keyed by the run name.
result_dic ={}

len_labels = labels.nunique() #for num_classes

# Split the data into training and test sets.
# NOTE(review): the split is not stratified although 'saldırgan' is only about
# 1.6% of the rows (166 of 10224); stratify=labels would keep the class ratios
# equal in both splits, but it would also change the reported numbers.
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state=42)
In [106]:
# Word unigram/bigram and character bigram/trigram vectorizers------------------------------

def _count_tfidf_pair(n, analyzer):
    """Return a (CountVectorizer, TfidfVectorizer) pair limited to exactly n-grams of ``analyzer``."""
    span = (n, n)
    return (
        CountVectorizer(ngram_range=span, analyzer=analyzer),
        TfidfVectorizer(ngram_range=span, analyzer=analyzer),
    )


# Word-level unigrams: raw counts and TF-IDF.
word_unigram_count, word_unigram_tfidf = _count_tfidf_pair(1, 'word')

# Word-level bigrams: raw counts and TF-IDF.
word_bigram_count, word_bigram_tfidf = _count_tfidf_pair(2, 'word')

# Character-level bigrams: raw counts and TF-IDF.
char_bigram_count, char_bigram_tfidf = _count_tfidf_pair(2, 'char')

# Character-level trigrams: raw counts and TF-IDF.
char_trigram_count, char_trigram_tfidf = _count_tfidf_pair(3, 'char')

word unigram-------------------------------------------¶

In [108]:
# Word-unigram features: the count and the TF-IDF vectorizer combined in one union.
feature_union = FeatureUnion(transformer_list=[
    ("word_unigram_count", word_unigram_count),
    ("word_unigram_tfidf", word_unigram_tfidf),
])
In [109]:
# Run the full benchmark on the word-unigram features and keep its results table.
results_df = feature_unionAndmodel_training(
    "Word_Unigram",
    feature_union,
    X_train, X_test,
    y_train, y_test,
    modelsAndNames,
    len_labels,
)
result_dic["Word_Unigram"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
Word_Unigram:

XGBoost,Under-Sampling,Word_Unigram:
accuracy: 0.5095354523227383
precision: 0.6575901117758006
recall: 0.5095354523227383
f1: 0.5646169098292118
Training Accuracy: 0.3695652173913044
Validation Accuracy: 0.5095354523227383

Word_Unigram:

CatBoostC,Under-Sampling,Word_Unigram:
accuracy: 0.6440097799511002
precision: 0.7080178858853468
recall: 0.6440097799511002
f1: 0.6674804861339697
Training Accuracy: 0.47826086956521735
Validation Accuracy: 0.6440097799511002

Word_Unigram:

MLPC-sgd,Under-Sampling,Word_Unigram:
accuracy: 0.7466992665036675
precision: 0.557559794597115
recall: 0.7466992665036675
f1: 0.6384153303197649
Training Accuracy: 0.3333333333333333
Validation Accuracy: 0.7466992665036675

Word_Unigram:

ExtraTreesClassifier,Under-Sampling,Word_Unigram:
accuracy: 0.6332518337408313
precision: 0.7261700861211683
recall: 0.6332518337408313
f1: 0.6652686178056041
Training Accuracy: 0.47826086956521735
Validation Accuracy: 0.6332518337408313

Word_Unigram:

XGBoost,Over-Sampling,Word_Unigram:
accuracy: 0.6381418092909535
precision: 0.7184951030066614
recall: 0.6381418092909535
f1: 0.6588627681405055
Training Accuracy: 0.4382566585956416
Validation Accuracy: 0.6381418092909535

Word_Unigram:

CatBoostC,Over-Sampling,Word_Unigram:
accuracy: 0.6581907090464547
precision: 0.727098286516982
recall: 0.6581907090464547
f1: 0.6823021112184453
Training Accuracy: 0.6886736615550175
Validation Accuracy: 0.6581907090464547

Word_Unigram:

MLPC-sgd,Over-Sampling,Word_Unigram:
accuracy: 0.8019559902200489
precision: 0.7874559613439057
recall: 0.8019559902200489
f1: 0.7918328320509477
Training Accuracy: 0.9390368576809255
Validation Accuracy: 0.8019559902200489

Word_Unigram:

ExtraTreesClassifier,Over-Sampling,Word_Unigram:
accuracy: 0.8273838630806846
precision: 0.8211581691741373
recall: 0.8273838630806846
f1: 0.7955818168049512
Training Accuracy: 0.9839117567931127
Validation Accuracy: 0.8273838630806846

Word_Unigram:

XGBoost,SMOTE,Word_Unigram:
accuracy: 0.6205378973105135
precision: 0.6508394808079272
recall: 0.6205378973105135
f1: 0.6090517132553599
Training Accuracy: 0.455044390637611
Validation Accuracy: 0.6205378973105135

Word_Unigram:

CatBoostC,SMOTE,Word_Unigram:
accuracy: 0.7403422982885085
precision: 0.7170896335321763
recall: 0.7403422982885085
f1: 0.7249969379249598
Training Accuracy: 0.8239978477266613
Validation Accuracy: 0.7403422982885085

Word_Unigram:

MLPC-sgd,SMOTE,Word_Unigram:
accuracy: 0.8058679706601467
precision: 0.791429485059973
recall: 0.8058679706601467
f1: 0.7953933164260065
Training Accuracy: 0.933871401668012
Validation Accuracy: 0.8058679706601467

Word_Unigram:

ExtraTreesClassifier,SMOTE,Word_Unigram:
accuracy: 0.8273838630806846
precision: 0.8228334102571414
recall: 0.8273838630806846
f1: 0.7993982645123894
Training Accuracy: 0.9652407855797686
Validation Accuracy: 0.8273838630806846
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.509535   0.657590  0.509535   
1              CatBoostC  Under-Sampling  0.644010   0.708018  0.644010   
2               MLPC-sgd  Under-Sampling  0.746699   0.557560  0.746699   
3   ExtraTreesClassifier  Under-Sampling  0.633252   0.726170  0.633252   
4                XGBoost   Over-Sampling  0.638142   0.718495  0.638142   
5              CatBoostC   Over-Sampling  0.658191   0.727098  0.658191   
6               MLPC-sgd   Over-Sampling  0.801956   0.787456  0.801956   
7   ExtraTreesClassifier   Over-Sampling  0.827384   0.821158  0.827384   
8                XGBoost           SMOTE  0.620538   0.650839  0.620538   
9              CatBoostC           SMOTE  0.740342   0.717090  0.740342   
10              MLPC-sgd           SMOTE  0.805868   0.791429  0.805868   
11  ExtraTreesClassifier           SMOTE  0.827384   0.822833  0.827384   

    f1_score  training_accuracy  validation_accuracy  
0   0.564617           0.369565             0.509535  
1   0.667480           0.478261             0.644010  
2   0.638415           0.333333             0.746699  
3   0.665269           0.478261             0.633252  
4   0.658863           0.438257             0.638142  
5   0.682302           0.688674             0.658191  
6   0.791833           0.939037             0.801956  
7   0.795582           0.983912             0.827384  
8   0.609052           0.455044             0.620538  
9   0.724997           0.823998             0.740342  
10  0.795393           0.933871             0.805868  
11  0.799398           0.965241             0.827384  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Word bigram-------------------------------------------¶

In [111]:
# Word-bigram features: the count and the TF-IDF vectorizer combined in one union.
feature_union = FeatureUnion(transformer_list=[
    ("word_bigram_count", word_bigram_count),
    ("word_bigram_tfidf", word_bigram_tfidf),
])
In [112]:
# Capture the returned table: without this assignment, `results_df` still holds
# the frame of an earlier run and that stale frame would be stored under this key.
results_df = feature_unionAndmodel_training("Word_bigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["Word_bigram"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
Word_bigram:

XGBoost,Under-Sampling,Word_bigram:
accuracy: 0.7447432762836186
precision: 0.5574622465122969
recall: 0.7447432762836186
f1: 0.6376355384833673
Training Accuracy: 0.36231884057971014
Validation Accuracy: 0.7447432762836186

Word_bigram:

CatBoostC,Under-Sampling,Word_bigram:
accuracy: 0.7383863080684596
precision: 0.7182721795030548
recall: 0.7383863080684596
f1: 0.6391073421502371
Training Accuracy: 0.35748792270531404
Validation Accuracy: 0.7383863080684596

Word_bigram:

MLPC-sgd,Under-Sampling,Word_bigram:
accuracy: 0.013691931540342298
precision: 0.0001874689893054202
recall: 0.013691931540342298
f1: 0.0003698736933232844
Training Accuracy: 0.3333333333333333
Validation Accuracy: 0.013691931540342298

Word_bigram:

ExtraTreesClassifier,Under-Sampling,Word_bigram:
accuracy: 0.3095354523227384
precision: 0.6727032185641018
recall: 0.3095354523227384
f1: 0.2534120249048235
Training Accuracy: 0.3913043478260869
Validation Accuracy: 0.3095354523227384

Word_bigram:

XGBoost,Over-Sampling,Word_bigram:
accuracy: 0.2396088019559902
precision: 0.05741237797478494
recall: 0.2396088019559902
f1: 0.09262983270882462
Training Accuracy: 0.3449018025289212
Validation Accuracy: 0.2396088019559902

Word_bigram:

CatBoostC,Over-Sampling,Word_bigram:
accuracy: 0.2371638141809291
precision: 0.6181486862841096
recall: 0.2371638141809291
f1: 0.09890916658396938
Training Accuracy: 0.6075329566854991
Validation Accuracy: 0.2371638141809291

Word_bigram:

MLPC-sgd,Over-Sampling,Word_bigram:
accuracy: 0.7383863080684596
precision: 0.6230981692527701
recall: 0.7383863080684596
f1: 0.6440928476812778
Training Accuracy: 0.664030131826742
Validation Accuracy: 0.7383863080684596

Word_bigram:

ExtraTreesClassifier,Over-Sampling,Word_bigram:
accuracy: 0.7897310513447433
precision: 0.7858351199314055
recall: 0.7897310513447433
f1: 0.7393722747828305
Training Accuracy: 0.9837503362927092
Validation Accuracy: 0.7897310513447433

Word_bigram:

XGBoost,SMOTE,Word_bigram:
accuracy: 0.24009779951100244
precision: 0.8041397327259939
recall: 0.24009779951100244
f1: 0.09364374255766049
Training Accuracy: 0.39009954264191543
Validation Accuracy: 0.24009779951100244

Word_bigram:

CatBoostC,SMOTE,Word_bigram:
accuracy: 0.7506112469437652
precision: 0.7995415197371188
recall: 0.7506112469437652
f1: 0.6484315697478267
Training Accuracy: 0.8844767285445251
Validation Accuracy: 0.7506112469437652

Word_bigram:

MLPC-sgd,SMOTE,Word_bigram:
accuracy: 0.7452322738386308
precision: 0.6822035740789999
recall: 0.7452322738386308
f1: 0.6498933479503115
Training Accuracy: 0.7736346516007533
Validation Accuracy: 0.7452322738386308

Word_bigram:

ExtraTreesClassifier,SMOTE,Word_bigram:
accuracy: 0.7887530562347188
precision: 0.7842287144906595
recall: 0.7887530562347188
f1: 0.7387461571157363
Training Accuracy: 0.9511433952111918
Validation Accuracy: 0.7887530562347188
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.744743   0.557462  0.744743   
1              CatBoostC  Under-Sampling  0.738386   0.718272  0.738386   
2               MLPC-sgd  Under-Sampling  0.013692   0.000187  0.013692   
3   ExtraTreesClassifier  Under-Sampling  0.309535   0.672703  0.309535   
4                XGBoost   Over-Sampling  0.239609   0.057412  0.239609   
5              CatBoostC   Over-Sampling  0.237164   0.618149  0.237164   
6               MLPC-sgd   Over-Sampling  0.738386   0.623098  0.738386   
7   ExtraTreesClassifier   Over-Sampling  0.789731   0.785835  0.789731   
8                XGBoost           SMOTE  0.240098   0.804140  0.240098   
9              CatBoostC           SMOTE  0.750611   0.799542  0.750611   
10              MLPC-sgd           SMOTE  0.745232   0.682204  0.745232   
11  ExtraTreesClassifier           SMOTE  0.788753   0.784229  0.788753   

    f1_score  training_accuracy  validation_accuracy  
0   0.637636           0.362319             0.744743  
1   0.639107           0.357488             0.738386  
2   0.000370           0.333333             0.013692  
3   0.253412           0.391304             0.309535  
4   0.092630           0.344902             0.239609  
5   0.098909           0.607533             0.237164  
6   0.644093           0.664030             0.738386  
7   0.739372           0.983750             0.789731  
8   0.093644           0.390100             0.240098  
9   0.648432           0.884477             0.750611  
10  0.649893           0.773635             0.745232  
11  0.738746           0.951143             0.788753  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

char bigram-------------------------------------------¶

In [114]:
# Character-bigram features: the count and the TF-IDF vectorizer combined in one union.
feature_union = FeatureUnion(transformer_list=[
    ("char_bigram_count", char_bigram_count),
    ("char_bigram_tfidf", char_bigram_tfidf),
])
In [115]:
# Capture the returned table: without this assignment, `results_df` still holds
# the frame of an earlier run and that stale frame would be stored under this key.
results_df = feature_unionAndmodel_training("char_bigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["char_bigram"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
char_bigram:

XGBoost,Under-Sampling,char_bigram:
accuracy: 0.6264058679706601
precision: 0.6347017545271905
recall: 0.6264058679706601
f1: 0.6303623506936571
Training Accuracy: 0.3333333333333333
Validation Accuracy: 0.6264058679706601

char_bigram:

CatBoostC,Under-Sampling,char_bigram:
accuracy: 0.5119804400977995
precision: 0.7012931523157435
recall: 0.5119804400977995
f1: 0.5733309891699646
Training Accuracy: 0.45893719806763283
Validation Accuracy: 0.5119804400977995

char_bigram:

MLPC-sgd,Under-Sampling,char_bigram:
accuracy: 0.20244498777506112
precision: 0.6621240364270823
recall: 0.20244498777506112
f1: 0.24674215185157422
Training Accuracy: 0.38647342995169076
Validation Accuracy: 0.20244498777506112

char_bigram:

ExtraTreesClassifier,Under-Sampling,char_bigram:
accuracy: 0.49339853300733494
precision: 0.7144449205645036
recall: 0.49339853300733494
f1: 0.5577858356038657
Training Accuracy: 0.5144927536231885
Validation Accuracy: 0.49339853300733494

char_bigram:

XGBoost,Over-Sampling,char_bigram:
accuracy: 0.2938875305623472
precision: 0.6309468584679232
recall: 0.2938875305623472
f1: 0.22549109881644047
Training Accuracy: 0.33440947000269033
Validation Accuracy: 0.2938875305623472

char_bigram:

CatBoostC,Over-Sampling,char_bigram:
accuracy: 0.6669926650366749
precision: 0.7481551692230886
recall: 0.6669926650366749
f1: 0.6970050768681457
Training Accuracy: 0.7357546408393866
Validation Accuracy: 0.6669926650366749

char_bigram:

MLPC-sgd,Over-Sampling,char_bigram:
accuracy: 0.7726161369193154
precision: 0.780073910877911
recall: 0.7726161369193154
f1: 0.7761152173651298
Training Accuracy: 0.8566585956416466
Validation Accuracy: 0.7726161369193154

char_bigram:

ExtraTreesClassifier,Over-Sampling,char_bigram:
accuracy: 0.7628361858190709
precision: 0.8063134639595191
recall: 0.7628361858190709
f1: 0.6746057809467082
Training Accuracy: 0.9862254506322303
Validation Accuracy: 0.7628361858190709

char_bigram:

XGBoost,SMOTE,char_bigram:
accuracy: 0.2396088019559902
precision: 0.05741237797478494
recall: 0.2396088019559902
f1: 0.09262983270882462
Training Accuracy: 0.36863061608824316
Validation Accuracy: 0.2396088019559902

char_bigram:

CatBoostC,SMOTE,char_bigram:
accuracy: 0.7735941320293398
precision: 0.7528107491638911
recall: 0.7735941320293398
f1: 0.7411515592040571
Training Accuracy: 0.8174334140435836
Validation Accuracy: 0.7735941320293398

char_bigram:

MLPC-sgd,SMOTE,char_bigram:
accuracy: 0.776039119804401
precision: 0.7779083451197722
recall: 0.776039119804401
f1: 0.7768841825426179
Training Accuracy: 0.8709712133440947
Validation Accuracy: 0.776039119804401

char_bigram:

ExtraTreesClassifier,SMOTE,char_bigram:
accuracy: 0.7799511002444988
precision: 0.7956635550575962
recall: 0.7799511002444988
f1: 0.7144830039226291
Training Accuracy: 0.9640570352434757
Validation Accuracy: 0.7799511002444988
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.626406   0.634702  0.626406   
1              CatBoostC  Under-Sampling  0.511980   0.701293  0.511980   
2               MLPC-sgd  Under-Sampling  0.202445   0.662124  0.202445   
3   ExtraTreesClassifier  Under-Sampling  0.493399   0.714445  0.493399   
4                XGBoost   Over-Sampling  0.293888   0.630947  0.293888   
5              CatBoostC   Over-Sampling  0.666993   0.748155  0.666993   
6               MLPC-sgd   Over-Sampling  0.772616   0.780074  0.772616   
7   ExtraTreesClassifier   Over-Sampling  0.762836   0.806313  0.762836   
8                XGBoost           SMOTE  0.239609   0.057412  0.239609   
9              CatBoostC           SMOTE  0.773594   0.752811  0.773594   
10              MLPC-sgd           SMOTE  0.776039   0.777908  0.776039   
11  ExtraTreesClassifier           SMOTE  0.779951   0.795664  0.779951   

    f1_score  training_accuracy  validation_accuracy  
0   0.630362           0.333333             0.626406  
1   0.573331           0.458937             0.511980  
2   0.246742           0.386473             0.202445  
3   0.557786           0.514493             0.493399  
4   0.225491           0.334409             0.293888  
5   0.697005           0.735755             0.666993  
6   0.776115           0.856659             0.772616  
7   0.674606           0.986225             0.762836  
8   0.092630           0.368631             0.239609  
9   0.741152           0.817433             0.773594  
10  0.776884           0.870971             0.776039  
11  0.714483           0.964057             0.779951  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

char trigram-------------------------------------------¶

In [117]:
# Character-trigram features: the count and the TF-IDF vectorizer combined in one union.
feature_union = FeatureUnion(transformer_list=[
    ("char_trigram_count", char_trigram_count),
    ("char_trigram_tfidf", char_trigram_tfidf),
])
In [118]:
# Capture the returned table: without this assignment, `results_df` still holds
# the frame of an earlier run and that stale frame would be stored under this key.
results_df = feature_unionAndmodel_training("char_trigram",feature_union,X_train, X_test, y_train, y_test,modelsAndNames,len_labels)
result_dic["char_trigram"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
char_trigram:

XGBoost,Under-Sampling,char_trigram:
accuracy: 0.7437652811735941
precision: 0.5575501150476575
recall: 0.7437652811735941
f1: 0.637334222419797
Training Accuracy: 0.3357487922705314
Validation Accuracy: 0.7437652811735941

char_trigram:

CatBoostC,Under-Sampling,char_trigram:
accuracy: 0.5266503667481662
precision: 0.6784295980459428
recall: 0.5266503667481662
f1: 0.5809084261817494
Training Accuracy: 0.4855072463768116
Validation Accuracy: 0.5266503667481662

char_trigram:

MLPC-sgd,Under-Sampling,char_trigram:
accuracy: 0.5114914425427873
precision: 0.6981192263978314
recall: 0.5114914425427873
f1: 0.56881666496536
Training Accuracy: 0.4106280193236715
Validation Accuracy: 0.5114914425427873

char_trigram:

ExtraTreesClassifier,Under-Sampling,char_trigram:
accuracy: 0.556479217603912
precision: 0.740135399322862
recall: 0.556479217603912
f1: 0.6139464031302234
Training Accuracy: 0.533816425120773
Validation Accuracy: 0.556479217603912

char_trigram:

XGBoost,Over-Sampling,char_trigram:
accuracy: 0.29682151589242056
precision: 0.6583538870591151
recall: 0.29682151589242056
f1: 0.3057447151651328
Training Accuracy: 0.3418348130212537
Validation Accuracy: 0.29682151589242056

char_trigram:

CatBoostC,Over-Sampling,char_trigram:
accuracy: 0.6963325183374083
precision: 0.753973885412959
recall: 0.6963325183374083
f1: 0.7193417392948225
Training Accuracy: 0.7485068603712671
Validation Accuracy: 0.6963325183374083

char_trigram:

MLPC-sgd,Over-Sampling,char_trigram:
accuracy: 0.832762836185819
precision: 0.8285802939423141
recall: 0.832762836185819
f1: 0.8303986865798468
Training Accuracy: 0.9494753833736885
Validation Accuracy: 0.832762836185819

char_trigram:

ExtraTreesClassifier,Over-Sampling,char_trigram:
accuracy: 0.8024449877750611
precision: 0.8251092212835346
recall: 0.8024449877750611
f1: 0.7510225224076662
Training Accuracy: 0.9863868711326339
Validation Accuracy: 0.8024449877750611

char_trigram:

XGBoost,SMOTE,char_trigram:
accuracy: 0.2396088019559902
precision: 0.05741237797478494
recall: 0.2396088019559902
f1: 0.09262983270882462
Training Accuracy: 0.39892386333064306
Validation Accuracy: 0.2396088019559902

char_trigram:

CatBoostC,SMOTE,char_trigram:
accuracy: 0.767237163814181
precision: 0.7453820528076922
recall: 0.767237163814181
f1: 0.749831730620505
Training Accuracy: 0.8355663169222493
Validation Accuracy: 0.767237163814181

char_trigram:

MLPC-sgd,SMOTE,char_trigram:
accuracy: 0.8352078239608802
precision: 0.8331876566028675
recall: 0.8352078239608802
f1: 0.8341407203725756
Training Accuracy: 0.9404896421845574
Validation Accuracy: 0.8352078239608802

char_trigram:

ExtraTreesClassifier,SMOTE,char_trigram:
accuracy: 0.79559902200489
precision: 0.8101592616492123
recall: 0.79559902200489
f1: 0.7426270168688798
Training Accuracy: 0.9597524885660479
Validation Accuracy: 0.79559902200489
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.743765   0.557550  0.743765   
1              CatBoostC  Under-Sampling  0.526650   0.678430  0.526650   
2               MLPC-sgd  Under-Sampling  0.511491   0.698119  0.511491   
3   ExtraTreesClassifier  Under-Sampling  0.556479   0.740135  0.556479   
4                XGBoost   Over-Sampling  0.296822   0.658354  0.296822   
5              CatBoostC   Over-Sampling  0.696333   0.753974  0.696333   
6               MLPC-sgd   Over-Sampling  0.832763   0.828580  0.832763   
7   ExtraTreesClassifier   Over-Sampling  0.802445   0.825109  0.802445   
8                XGBoost           SMOTE  0.239609   0.057412  0.239609   
9              CatBoostC           SMOTE  0.767237   0.745382  0.767237   
10              MLPC-sgd           SMOTE  0.835208   0.833188  0.835208   
11  ExtraTreesClassifier           SMOTE  0.795599   0.810159  0.795599   

    f1_score  training_accuracy  validation_accuracy  
0   0.637334           0.335749             0.743765  
1   0.580908           0.485507             0.526650  
2   0.568817           0.410628             0.511491  
3   0.613946           0.533816             0.556479  
4   0.305745           0.341835             0.296822  
5   0.719342           0.748507             0.696333  
6   0.830399           0.949475             0.832763  
7   0.751023           0.986387             0.802445  
8   0.092630           0.398924             0.239609  
9   0.749832           0.835566             0.767237  
10  0.834141           0.940490             0.835208  
11  0.742627           0.959752             0.795599  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

word bigram + char trigram-------------------------------------------¶

In [120]:
# Concatenate the outputs of the word_bigram_* and char_trigram_* transformers
# (defined in earlier cells) into a single combined feature space.
feature_union = FeatureUnion(
    transformer_list=[
        ("word_bigram_count", word_bigram_count),
        ("word_bigram_tfidf", word_bigram_tfidf),
        ("char_trigram_count", char_trigram_count),
        ("char_trigram_tfidf", char_trigram_tfidf),
    ]
)
In [121]:
# Capture the returned results table before storing it. Without the assignment,
# `results_df` would still be whatever an earlier cell last bound to that name,
# and stale metrics would be filed under this key.
# (Assumes feature_unionAndmodel_training returns the results DataFrame, as the
# final experiment cell of this notebook already relies on - confirm.)
# NOTE(review): the label says "word_tri", but the feature_union built in the
# preceding cell combines the word_bigram_* and char_trigram_* transformers.
# The label is kept unchanged so result_dic keys and plot titles stay stable;
# confirm the intended name.
results_df = feature_unionAndmodel_training(
    "word_tri+char_tri",
    feature_union,
    X_train, X_test, y_train, y_test,
    modelsAndNames,
    len_labels,
)
result_dic["word_tri+char_tri"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
word_tri+char_tri:

XGBoost,Under-Sampling,word_tri+char_tri:
accuracy: 0.4102689486552567
precision: 0.6326681653246251
recall: 0.4102689486552567
f1: 0.4280388951475887
Training Accuracy: 0.3695652173913044
Validation Accuracy: 0.4102689486552567

word_tri+char_tri:

CatBoostC,Under-Sampling,word_tri+char_tri:
accuracy: 0.5545232273838631
precision: 0.6947910642326474
recall: 0.5545232273838631
f1: 0.605369518144594
Training Accuracy: 0.4879227053140096
Validation Accuracy: 0.5545232273838631

word_tri+char_tri:

MLPC-sgd,Under-Sampling,word_tri+char_tri:
accuracy: 0.5300733496332518
precision: 0.7261427094807171
recall: 0.5300733496332518
f1: 0.6065311265402523
Training Accuracy: 0.4806763285024154
Validation Accuracy: 0.5300733496332518

word_tri+char_tri:

ExtraTreesClassifier,Under-Sampling,word_tri+char_tri:
accuracy: 0.5491442542787286
precision: 0.747378496008514
recall: 0.5491442542787286
f1: 0.6112365787596845
Training Accuracy: 0.5386473429951691
Validation Accuracy: 0.5491442542787286

word_tri+char_tri:

XGBoost,Over-Sampling,word_tri+char_tri:
accuracy: 0.022004889975550123
precision: 0.8321424024011673
recall: 0.022004889975550123
f1: 0.019620963780231933
Training Accuracy: 0.35275760021522734
Validation Accuracy: 0.022004889975550123

word_tri+char_tri:

CatBoostC,Over-Sampling,word_tri+char_tri:
accuracy: 0.7085574572127139
precision: 0.7643728054902549
recall: 0.7085574572127139
f1: 0.7307394895252735
Training Accuracy: 0.749313962873285
Validation Accuracy: 0.7085574572127139

word_tri+char_tri:

MLPC-sgd,Over-Sampling,word_tri+char_tri:
accuracy: 0.8366748166259169
precision: 0.8320275601711585
recall: 0.8366748166259169
f1: 0.8340288412594014
Training Accuracy: 0.9533494753833738
Validation Accuracy: 0.8366748166259169

word_tri+char_tri:

ExtraTreesClassifier,Over-Sampling,word_tri+char_tri:
accuracy: 0.789242053789731
precision: 0.8176749955061983
recall: 0.789242053789731
f1: 0.7284874608985384
Training Accuracy: 0.9867635189669088
Validation Accuracy: 0.789242053789731

word_tri+char_tri:

XGBoost,SMOTE,word_tri+char_tri:
accuracy: 0.23667481662591688
precision: 0.05824082521378438
recall: 0.23667481662591688
f1: 0.09326248302091822
Training Accuracy: 0.47925746569814365
Validation Accuracy: 0.23667481662591688

word_tri+char_tri:

CatBoostC,SMOTE,word_tri+char_tri:
accuracy: 0.7750611246943765
precision: 0.7501122442982812
recall: 0.7750611246943765
f1: 0.7504386915824807
Training Accuracy: 0.8392789884315307
Validation Accuracy: 0.7750611246943765

word_tri+char_tri:

MLPC-sgd,SMOTE,word_tri+char_tri:
accuracy: 0.8366748166259169
precision: 0.830694918498291
recall: 0.8366748166259169
f1: 0.833306726716856
Training Accuracy: 0.9483454398708636
Validation Accuracy: 0.8366748166259169

word_tri+char_tri:

ExtraTreesClassifier,SMOTE,word_tri+char_tri:
accuracy: 0.7867970660146699
precision: 0.8155873610708242
recall: 0.7867970660146699
f1: 0.7238500112549312
Training Accuracy: 0.9601291364003228
Validation Accuracy: 0.7867970660146699
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.410269   0.632668  0.410269   
1              CatBoostC  Under-Sampling  0.554523   0.694791  0.554523   
2               MLPC-sgd  Under-Sampling  0.530073   0.726143  0.530073   
3   ExtraTreesClassifier  Under-Sampling  0.549144   0.747378  0.549144   
4                XGBoost   Over-Sampling  0.022005   0.832142  0.022005   
5              CatBoostC   Over-Sampling  0.708557   0.764373  0.708557   
6               MLPC-sgd   Over-Sampling  0.836675   0.832028  0.836675   
7   ExtraTreesClassifier   Over-Sampling  0.789242   0.817675  0.789242   
8                XGBoost           SMOTE  0.236675   0.058241  0.236675   
9              CatBoostC           SMOTE  0.775061   0.750112  0.775061   
10              MLPC-sgd           SMOTE  0.836675   0.830695  0.836675   
11  ExtraTreesClassifier           SMOTE  0.786797   0.815587  0.786797   

    f1_score  training_accuracy  validation_accuracy  
0   0.428039           0.369565             0.410269  
1   0.605370           0.487923             0.554523  
2   0.606531           0.480676             0.530073  
3   0.611237           0.538647             0.549144  
4   0.019621           0.352758             0.022005  
5   0.730739           0.749314             0.708557  
6   0.834029           0.953349             0.836675  
7   0.728487           0.986764             0.789242  
8   0.093262           0.479257             0.236675  
9   0.750439           0.839279             0.775061  
10  0.833307           0.948345             0.836675  
11  0.723850           0.960129             0.786797  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

word unigram + char trigram-------------------------------------------¶

In [123]:
# Concatenate the outputs of the word_unigram_* and char_trigram_* transformers
# (defined in earlier cells) into a single combined feature space.
feature_union = FeatureUnion(
    transformer_list=[
        ("word_unigram_count", word_unigram_count),
        ("word_unigram_tfidf", word_unigram_tfidf),
        ("char_trigram_count", char_trigram_count),
        ("char_trigram_tfidf", char_trigram_tfidf),
    ]
)
In [124]:
# Capture the returned results table before storing it. Without the assignment,
# `results_df` would still be whatever an earlier cell last bound to that name,
# and stale metrics would be filed under this key.
# (Assumes feature_unionAndmodel_training returns the results DataFrame, as the
# final experiment cell of this notebook already relies on - confirm.)
# NOTE(review): the label says "char_uni", but the feature_union built in the
# preceding cell combines the word_unigram_* and char_trigram_* transformers.
# The label is kept unchanged so result_dic keys and plot titles stay stable;
# confirm the intended name.
results_df = feature_unionAndmodel_training(
    "char_uni+char_tri",
    feature_union,
    X_train, X_test, y_train, y_test,
    modelsAndNames,
    len_labels,
)
result_dic["char_uni+char_tri"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
char_uni+char_tri:

XGBoost,Under-Sampling,char_uni+char_tri:
accuracy: 0.6400977995110024
precision: 0.5871215502758061
recall: 0.6400977995110024
f1: 0.610624168789445
Training Accuracy: 0.37198067632850246
Validation Accuracy: 0.6400977995110024

char_uni+char_tri:

CatBoostC,Under-Sampling,char_uni+char_tri:
accuracy: 0.5202933985330074
precision: 0.6954071506598201
recall: 0.5202933985330074
f1: 0.5789420190665412
Training Accuracy: 0.4879227053140096
Validation Accuracy: 0.5202933985330074

char_uni+char_tri:

MLPC-sgd,Under-Sampling,char_uni+char_tri:
accuracy: 0.4356968215158924
precision: 0.6739252814384025
recall: 0.4356968215158924
f1: 0.5167196176640665
Training Accuracy: 0.4057971014492754
Validation Accuracy: 0.4356968215158924

char_uni+char_tri:

ExtraTreesClassifier,Under-Sampling,char_uni+char_tri:
accuracy: 0.5677261613691932
precision: 0.7466939546285011
recall: 0.5677261613691932
f1: 0.6234122817232957
Training Accuracy: 0.5652173913043478
Validation Accuracy: 0.5677261613691932

char_uni+char_tri:

XGBoost,Over-Sampling,char_uni+char_tri:
accuracy: 0.019559902200488997
precision: 0.7837063693019813
recall: 0.019559902200488997
f1: 0.014767866963510711
Training Accuracy: 0.3403282216841539
Validation Accuracy: 0.019559902200488997

char_uni+char_tri:

CatBoostC,Over-Sampling,char_uni+char_tri:
accuracy: 0.6992665036674817
precision: 0.7588717427719135
recall: 0.6992665036674817
f1: 0.7222379983523248
Training Accuracy: 0.7532956685499057
Validation Accuracy: 0.6992665036674817

char_uni+char_tri:

MLPC-sgd,Over-Sampling,char_uni+char_tri:
accuracy: 0.8283618581907091
precision: 0.8202322560693016
recall: 0.8283618581907091
f1: 0.8238097105427027
Training Accuracy: 0.9495291902071563
Validation Accuracy: 0.8283618581907091

char_uni+char_tri:

ExtraTreesClassifier,Over-Sampling,char_uni+char_tri:
accuracy: 0.7975550122249389
precision: 0.8130472030199646
recall: 0.7975550122249389
f1: 0.7445224962018224
Training Accuracy: 0.9868173258003766
Validation Accuracy: 0.7975550122249389

char_uni+char_tri:

XGBoost,SMOTE,char_uni+char_tri:
accuracy: 0.23374083129584353
precision: 0.05819951265725905
recall: 0.23374083129584353
f1: 0.09318732497031335
Training Accuracy: 0.43938660209846647
Validation Accuracy: 0.23374083129584353

char_uni+char_tri:

CatBoostC,SMOTE,char_uni+char_tri:
accuracy: 0.7726161369193154
precision: 0.7484608430347525
recall: 0.7726161369193154
f1: 0.7488497970840255
Training Accuracy: 0.8409470002690341
Validation Accuracy: 0.7726161369193154

char_uni+char_tri:

MLPC-sgd,SMOTE,char_uni+char_tri:
accuracy: 0.8273838630806846
precision: 0.8209420578880833
recall: 0.8273838630806846
f1: 0.8237890028579609
Training Accuracy: 0.940973903685768
Validation Accuracy: 0.8273838630806846

char_uni+char_tri:

ExtraTreesClassifier,SMOTE,char_uni+char_tri:
accuracy: 0.7970660146699267
precision: 0.8126595086967541
recall: 0.7970660146699267
f1: 0.7440900031427411
Training Accuracy: 0.9607210115684692
Validation Accuracy: 0.7970660146699267
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.640098   0.587122  0.640098   
1              CatBoostC  Under-Sampling  0.520293   0.695407  0.520293   
2               MLPC-sgd  Under-Sampling  0.435697   0.673925  0.435697   
3   ExtraTreesClassifier  Under-Sampling  0.567726   0.746694  0.567726   
4                XGBoost   Over-Sampling  0.019560   0.783706  0.019560   
5              CatBoostC   Over-Sampling  0.699267   0.758872  0.699267   
6               MLPC-sgd   Over-Sampling  0.828362   0.820232  0.828362   
7   ExtraTreesClassifier   Over-Sampling  0.797555   0.813047  0.797555   
8                XGBoost           SMOTE  0.233741   0.058200  0.233741   
9              CatBoostC           SMOTE  0.772616   0.748461  0.772616   
10              MLPC-sgd           SMOTE  0.827384   0.820942  0.827384   
11  ExtraTreesClassifier           SMOTE  0.797066   0.812660  0.797066   

    f1_score  training_accuracy  validation_accuracy  
0   0.610624           0.371981             0.640098  
1   0.578942           0.487923             0.520293  
2   0.516720           0.405797             0.435697  
3   0.623412           0.565217             0.567726  
4   0.014768           0.340328             0.019560  
5   0.722238           0.753296             0.699267  
6   0.823810           0.949529             0.828362  
7   0.744522           0.986817             0.797555  
8   0.093187           0.439387             0.233741  
9   0.748850           0.840947             0.772616  
10  0.823789           0.940974             0.827384  
11  0.744090           0.960721             0.797066  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

word unigram + char bigram + char trigram -------------------------------------------¶

In [126]:
# Concatenate the outputs of the word_unigram_*, char_bigram_* and
# char_trigram_* transformers (defined in earlier cells) into a single
# combined feature space.
feature_union = FeatureUnion(
    transformer_list=[
        ("word_unigram_count", word_unigram_count),
        ("word_unigram_tfidf", word_unigram_tfidf),
        ("char_bigram_count", char_bigram_count),
        ("char_bigram_tfidf", char_bigram_tfidf),
        ("char_trigram_count", char_trigram_count),
        ("char_trigram_tfidf", char_trigram_tfidf),
    ]
)
In [127]:
# Capture the returned results table before storing it. Without the assignment,
# `results_df` would still be whatever an earlier cell last bound to that name,
# and stale metrics would be filed under this key.
# (Assumes feature_unionAndmodel_training returns the results DataFrame, as the
# final experiment cell of this notebook already relies on - confirm.)
# NOTE(review): the label says "word_bi", but the feature_union built in the
# preceding cell uses the char_bigram_* transformers (not word_bigram_*)
# alongside word_unigram_* and char_trigram_*. The label is kept unchanged so
# result_dic keys and plot titles stay stable; confirm which one was intended.
results_df = feature_unionAndmodel_training(
    "word_uni+word_bi+char_tri",
    feature_union,
    X_train, X_test, y_train, y_test,
    modelsAndNames,
    len_labels,
)
result_dic["word_uni+word_bi+char_tri"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
word_uni+word_bi+char_tri:

XGBoost,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.2396088019559902
precision: 0.6010213048009339
recall: 0.2396088019559902
f1: 0.10066342121438328
Training Accuracy: 0.357487922705314
Validation Accuracy: 0.2396088019559902

word_uni+word_bi+char_tri:

CatBoostC,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.5246943765281173
precision: 0.697460324720474
recall: 0.5246943765281173
f1: 0.5850971267318494
Training Accuracy: 0.4613526570048309
Validation Accuracy: 0.5246943765281173

word_uni+word_bi+char_tri:

MLPC-sgd,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.4924205378973105
precision: 0.6964274470421332
recall: 0.4924205378973105
f1: 0.5582339864481477
Training Accuracy: 0.5048309178743962
Validation Accuracy: 0.4924205378973105

word_uni+word_bi+char_tri:

ExtraTreesClassifier,Under-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.5691931540342299
precision: 0.756086457759344
recall: 0.5691931540342299
f1: 0.6261420613838502
Training Accuracy: 0.5434782608695653
Validation Accuracy: 0.5691931540342299

word_uni+word_bi+char_tri:

XGBoost,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.24449877750611246
precision: 0.5457468467360217
recall: 0.24449877750611246
f1: 0.10885840167008869
Training Accuracy: 0.36206618240516547
Validation Accuracy: 0.24449877750611246

word_uni+word_bi+char_tri:

CatBoostC,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.7178484107579463
precision: 0.7683158388263848
recall: 0.7178484107579463
f1: 0.7378813887877547
Training Accuracy: 0.7563626580575734
Validation Accuracy: 0.7178484107579463

word_uni+word_bi+char_tri:

MLPC-sgd,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.8381418092909535
precision: 0.8389920793674207
recall: 0.8381418092909535
f1: 0.8384964609802923
Training Accuracy: 0.9513586225450631
Validation Accuracy: 0.8381418092909535

word_uni+word_bi+char_tri:

ExtraTreesClassifier,Over-Sampling,word_uni+word_bi+char_tri:
accuracy: 0.7882640586797066
precision: 0.8143835519307563
recall: 0.7882640586797066
f1: 0.7261293406259197
Training Accuracy: 0.9862792574656981
Validation Accuracy: 0.7882640586797066

word_uni+word_bi+char_tri:

XGBoost,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.35794621026894863
precision: 0.6624535910948148
recall: 0.35794621026894863
f1: 0.361458297753706
Training Accuracy: 0.37928436911487756
Validation Accuracy: 0.35794621026894863

word_uni+word_bi+char_tri:

CatBoostC,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.7877750611246944
precision: 0.7735558230366292
recall: 0.7877750611246944
f1: 0.7649803870163941
Training Accuracy: 0.8362658057573312
Validation Accuracy: 0.7877750611246944

word_uni+word_bi+char_tri:

MLPC-sgd,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.8381418092909535
precision: 0.8394583973927293
recall: 0.8381418092909535
f1: 0.8387471869942711
Training Accuracy: 0.9442561205273069
Validation Accuracy: 0.8381418092909535

word_uni+word_bi+char_tri:

ExtraTreesClassifier,SMOTE,word_uni+word_bi+char_tri:
accuracy: 0.7951100244498778
precision: 0.8159438172725928
recall: 0.7951100244498778
f1: 0.739746258991426
Training Accuracy: 0.9653483992467043
Validation Accuracy: 0.7951100244498778
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.239609   0.601021  0.239609   
1              CatBoostC  Under-Sampling  0.524694   0.697460  0.524694   
2               MLPC-sgd  Under-Sampling  0.492421   0.696427  0.492421   
3   ExtraTreesClassifier  Under-Sampling  0.569193   0.756086  0.569193   
4                XGBoost   Over-Sampling  0.244499   0.545747  0.244499   
5              CatBoostC   Over-Sampling  0.717848   0.768316  0.717848   
6               MLPC-sgd   Over-Sampling  0.838142   0.838992  0.838142   
7   ExtraTreesClassifier   Over-Sampling  0.788264   0.814384  0.788264   
8                XGBoost           SMOTE  0.357946   0.662454  0.357946   
9              CatBoostC           SMOTE  0.787775   0.773556  0.787775   
10              MLPC-sgd           SMOTE  0.838142   0.839458  0.838142   
11  ExtraTreesClassifier           SMOTE  0.795110   0.815944  0.795110   

    f1_score  training_accuracy  validation_accuracy  
0   0.100663           0.357488             0.239609  
1   0.585097           0.461353             0.524694  
2   0.558234           0.504831             0.492421  
3   0.626142           0.543478             0.569193  
4   0.108858           0.362066             0.244499  
5   0.737881           0.756363             0.717848  
6   0.838496           0.951359             0.838142  
7   0.726129           0.986279             0.788264  
8   0.361458           0.379284             0.357946  
9   0.764980           0.836266             0.787775  
10  0.838747           0.944256             0.838142  
11  0.739746           0.965348             0.795110  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

word unigram + word bigram + char bigram + char trigram -------------------------------------------¶

In [129]:
# Concatenate the outputs of all four transformer families used in this
# notebook section (word_unigram_*, word_bigram_*, char_bigram_*,
# char_trigram_*; defined in earlier cells) into a single feature space.
feature_union = FeatureUnion(
    transformer_list=[
        ("word_unigram_count", word_unigram_count),
        ("word_unigram_tfidf", word_unigram_tfidf),
        ("word_bigram_count", word_bigram_count),
        ("word_bigram_tfidf", word_bigram_tfidf),
        ("char_bigram_count", char_bigram_count),
        ("char_bigram_tfidf", char_bigram_tfidf),
        ("char_trigram_count", char_trigram_count),
        ("char_trigram_tfidf", char_trigram_tfidf),
    ]
)
In [130]:
# Run every model/sampling combination on the combined feature space and
# file the returned results table under this experiment's label.
results_df = feature_unionAndmodel_training(
    "word_uni+word_bi+char_bi+char_tri",
    feature_union,
    X_train, X_test, y_train, y_test,
    modelsAndNames,
    len_labels,
)
result_dic["word_uni+word_bi+char_bi+char_tri"] = results_df
etiket
0    6195
1    1846
2     138
Name: count, dtype: int64
No description has been provided for this image
Original Training Data Class Distributionı: Counter({0: 6195, 1: 1846, 2: 138})
etiket
0    138
1    138
2    138
Name: count, dtype: int64
No description has been provided for this image
After Under-Sampling,Training Data Class Distribution: Counter({0: 138, 1: 138, 2: 138})
After Over-Sampling, Training Data Class Distribution: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
After SMOTE ,Training Data Class Distributionı: Counter({1: 6195, 0: 6195, 2: 6195})
etiket
1    6195
0    6195
2    6195
Name: count, dtype: int64
No description has been provided for this image
word_uni+word_bi+char_bi+char_tri:

XGBoost,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.013691931540342298
precision: 0.0001874689893054202
recall: 0.013691931540342298
f1: 0.0003698736933232844
Training Accuracy: 0.32850241545893716
Validation Accuracy: 0.013691931540342298

word_uni+word_bi+char_bi+char_tri:

CatBoostC,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.49779951100244496
precision: 0.69280455043867
recall: 0.49779951100244496
f1: 0.5567203737246547
Training Accuracy: 0.43719806763285024
Validation Accuracy: 0.49779951100244496

word_uni+word_bi+char_bi+char_tri:

MLPC-sgd,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.4963325183374083
precision: 0.6773985151234119
recall: 0.4963325183374083
f1: 0.5658100095215349
Training Accuracy: 0.4830917874396135
Validation Accuracy: 0.4963325183374083

word_uni+word_bi+char_bi+char_tri:

ExtraTreesClassifier,Under-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.6019559902200489
precision: 0.7526797826663255
recall: 0.6019559902200489
f1: 0.6533982546175323
Training Accuracy: 0.5603864734299517
Validation Accuracy: 0.6019559902200489

word_uni+word_bi+char_bi+char_tri:

XGBoost,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.07383863080684597
precision: 0.5590891862039353
recall: 0.07383863080684597
f1: 0.11046800085182826
Training Accuracy: 0.35598601022329834
Validation Accuracy: 0.07383863080684597

word_uni+word_bi+char_bi+char_tri:

CatBoostC,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.7012224938875306
precision: 0.7619246811117365
recall: 0.7012224938875306
f1: 0.7249652562645343
Training Accuracy: 0.7583535108958838
Validation Accuracy: 0.7012224938875306

word_uni+word_bi+char_bi+char_tri:

MLPC-sgd,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.8420537897310514
precision: 0.837977812944816
recall: 0.8420537897310514
f1: 0.8398772181999302
Training Accuracy: 0.9549098735539413
Validation Accuracy: 0.8420537897310514

word_uni+word_bi+char_bi+char_tri:

ExtraTreesClassifier,Over-Sampling,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.7833740831295843
precision: 0.8107393367212573
recall: 0.7833740831295843
f1: 0.7177808334228963
Training Accuracy: 0.9866020984665053
Validation Accuracy: 0.7833740831295843

word_uni+word_bi+char_bi+char_tri:

XGBoost,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.20097799511002445
precision: 0.5930430834219854
recall: 0.20097799511002445
f1: 0.23997680954587133
Training Accuracy: 0.34210384718859294
Validation Accuracy: 0.20097799511002445

word_uni+word_bi+char_bi+char_tri:

CatBoostC,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.78239608801956
precision: 0.7639352682392094
recall: 0.78239608801956
f1: 0.7613290224414989
Training Accuracy: 0.8441216034436373
Validation Accuracy: 0.78239608801956

word_uni+word_bi+char_bi+char_tri:

MLPC-sgd,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.8469437652811735
precision: 0.8437840595842998
recall: 0.8469437652811735
f1: 0.8452462622973984
Training Accuracy: 0.9498520312079636
Validation Accuracy: 0.8469437652811735

word_uni+word_bi+char_bi+char_tri:

ExtraTreesClassifier,SMOTE,word_uni+word_bi+char_bi+char_tri:
accuracy: 0.7907090464547677
precision: 0.8066973414488265
recall: 0.7907090464547677
f1: 0.733048315750241
Training Accuracy: 0.9628732849071832
Validation Accuracy: 0.7907090464547677
                   model        sampling  accuracy  precision    recall  \
0                XGBoost  Under-Sampling  0.013692   0.000187  0.013692   
1              CatBoostC  Under-Sampling  0.497800   0.692805  0.497800   
2               MLPC-sgd  Under-Sampling  0.496333   0.677399  0.496333   
3   ExtraTreesClassifier  Under-Sampling  0.601956   0.752680  0.601956   
4                XGBoost   Over-Sampling  0.073839   0.559089  0.073839   
5              CatBoostC   Over-Sampling  0.701222   0.761925  0.701222   
6               MLPC-sgd   Over-Sampling  0.842054   0.837978  0.842054   
7   ExtraTreesClassifier   Over-Sampling  0.783374   0.810739  0.783374   
8                XGBoost           SMOTE  0.200978   0.593043  0.200978   
9              CatBoostC           SMOTE  0.782396   0.763935  0.782396   
10              MLPC-sgd           SMOTE  0.846944   0.843784  0.846944   
11  ExtraTreesClassifier           SMOTE  0.790709   0.806697  0.790709   

    f1_score  training_accuracy  validation_accuracy  
0   0.000370           0.328502             0.013692  
1   0.556720           0.437198             0.497800  
2   0.565810           0.483092             0.496333  
3   0.653398           0.560386             0.601956  
4   0.110468           0.355986             0.073839  
5   0.724965           0.758354             0.701222  
6   0.839877           0.954910             0.842054  
7   0.717781           0.986602             0.783374  
8   0.239977           0.342104             0.200978  
9   0.761329           0.844122             0.782396  
10  0.845246           0.949852             0.846944  
11  0.733048           0.962873             0.790709  
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

😊😊conclusion😊😊¶

In [132]:
def dictionary_to_dataframe(result_dic):
    """
    Combine per-experiment results into one long DataFrame.

    Each value of ``result_dic`` is converted to a DataFrame, tagged with its
    dictionary key in a leading ``gram_name`` column, and all frames are then
    stacked row-wise with a fresh integer index.

    Parameters:
        result_dic (dict): Maps a gram_name label to that experiment's results
            (anything accepted by ``pd.DataFrame``, e.g. a DataFrame or a list
            of dicts).

    Returns:
        pd.DataFrame: Combined DataFrame with ``gram_name`` as the first column.

    Raises:
        ValueError: If ``result_dic`` is empty, or if a value already contains
            a ``gram_name`` column.
    """
    if not result_dic:
        # pd.concat([]) would raise a less helpful ValueError; fail early and clearly.
        raise ValueError("result_dic is empty: there are no results to combine.")

    dataframes = []

    for gram_name, metrics in result_dic.items():
        # Copy explicitly: depending on the pandas version, pd.DataFrame(df)
        # may share internals with `df`, in which case insert() below would add
        # the column to the caller's frame and make a second call fail.
        temp_df = pd.DataFrame(metrics).copy()
        temp_df.insert(0, 'gram_name', gram_name)
        dataframes.append(temp_df)

    return pd.concat(dataframes, ignore_index=True)

def visualize_metrics(result_dic):
    """
    Draw one grouped bar chart summarising every experiment in ``result_dic``.

    The results are flattened with ``dictionary_to_dataframe``, then averaged
    per (gram_name, sampling) pair over all rows sharing that pair. Each pair
    becomes one group on the x-axis, with one bar per metric.

    Parameters:
        result_dic (dict): Dictionary containing results with gram_name as keys.
            Every result table must provide a 'sampling' column and the numeric
            columns listed in ``metrics`` below.

    Returns:
        None: Displays the visualization.
    """
    # Convert dictionary to a single long DataFrame
    final_df = dictionary_to_dataframe(result_dic)

    # Mean of every numeric column per (gram_name, sampling) pair
    grouped_df = final_df.groupby(['gram_name', 'sampling']).mean(numeric_only=True).reset_index()

    # Metrics to plot (one bar per metric inside each group)
    metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'training_accuracy', 'validation_accuracy']

    # Create figure and axes
    fig, ax = plt.subplots(figsize=(14, 8))

    # One x-axis label per group: "<gram_name>\n(<sampling>)"
    gram_sampling_labels = [
        f"{gram}\n({sampling})"
        for gram, sampling in zip(grouped_df['gram_name'], grouped_df['sampling'])
    ]
    x = np.arange(len(gram_sampling_labels))  # The group locations
    width = 0.12  # Narrow bars so that all metrics fit inside one unit of x

    # Plot each metric as a separate set of bars, shifted by one bar width
    for i, metric in enumerate(metrics):
        ax.bar(x + i * width, grouped_df[metric], width, label=metric.capitalize())

    # Add labels, title, and legend
    ax.set_xlabel('Gram Name and Sampling Method')
    ax.set_ylabel('Score')
    ax.set_title('Mean Scores by Gram Name, Sampling Method, and Metric')
    # Bars are centred on x + i*width for i = 0..len(metrics)-1, so the middle
    # of a group is at x + width*(len(metrics)-1)/2 (not width*len(metrics)/2,
    # which would shift every tick half a bar to the right).
    ax.set_xticks(x + width * (len(metrics) - 1) / 2)
    ax.set_xticklabels(gram_sampling_labels, rotation=45, ha='right')
    ax.legend()

    # Avoid clipped tick labels, then render
    plt.tight_layout()
    plt.show()
In [133]:
 # Plot the mean of each metric per (gram_name, sampling) pair for all experiments stored in result_dic.
 visualize_metrics(result_dic)
No description has been provided for this image